In [76]:
suppressPackageStartupMessages({
    library(Seurat, quietly = T)
    library(HGNChelper, quietly = T)
    
    library(dplyr, quietly = T)
    library(openxlsx, quietly = T)
    library(reshape2, quietly = T)
    library(data.table, quietly = T)    
    
    library(ggplot2, quietly = T)
    library(ggraph, quietly = T)
    library(ggpubr, quietly = T)
    library(igraph, quietly = T)
    library(RColorBrewer, quietly = T)

})

data_path = '/data3/hratch/norcross_abc/'
sctype_path = '/data2/hratch/Software/'
In [48]:
abc.integrated<-readRDS(file = paste0(data_path, 'interim/abc_integrated.RDS'))
md<-abc.integrated@meta.data

Automated Annotation¶

Automatically annotate each cluster based on markers using ScType

According the the GitHub tutorial, you can used the scaled, integrated counts matrix as input

First, I cloned the github repo (commit ID 36e298c49a57846c8de3f8d1ee58f753d3a9a2a0) into /data2/hratch/Software:

In [3]:
# git clone git@github.com:IanevskiAleksandr/sc-type.git --> commit ID 36e298c49a57846c8de3f8d1ee58f753d3a9a2a0
In [49]:
# load sc-type functions from locally cloned repo (ensures consistency with version)

source(paste0(sctype_path, 'sc-type/', 'R/gene_sets_prepare.R'))
source(paste0(sctype_path, 'sc-type/', 'R/sctype_score_.R'))       
db_<-paste0(sctype_path, 'sc-type/', 'ScTypeDB_full.xlsx')

# # load from remote      
# source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx"
In [2]:
path_to_db_file = db_
cell_type = "Immune system" 

cell_markers = openxlsx::read.xlsx(path_to_db_file)
cell_markers = cell_markers[cell_markers$tissueType == cell_type,] 
cell_markers$geneSymbolmore1 = gsub(" ","",cell_markers$geneSymbolmore1); cell_markers$geneSymbolmore2 = gsub(" ","",cell_markers$geneSymbolmore2)
In [6]:
tissue = "Immune system" 
# prepare gene sets
suppressWarnings({
    suppressMessages({
        gs_list = gene_sets_prepare(db_, tissue)
    })
})

The cells can be annotated as any of the following cell types:

In [7]:
intersect(names(gs_list$gs_positive), names(gs_list$gs_negative))
  1. 'Pro-B cells'
  2. 'Pre-B cells'
  3. 'Naive B cells'
  4. 'Memory B cells'
  5. 'Plasma B cells'
  6. 'Naive CD8+ T cells'
  7. 'Naive CD4+ T cells'
  8. 'Memory CD8+ T cells'
  9. 'Memory CD4+ T cells'
  10. 'Effector CD8+ T cells'
  11. 'Effector CD4+ T cells'
  12. 'γδ-T cells'
  13. 'Platelets'
  14. 'CD8+ NKT-like cells'
  15. 'CD4+ NKT-like cells'
  16. 'Natural killer cells'
  17. 'Eosinophils'
  18. 'Neutrophils'
  19. 'Basophils'
  20. 'Mast cells'
  21. 'Classical Monocytes'
  22. 'Non-classical monocytes'
  23. 'Intermediate monocytes'
  24. 'Macrophages'
  25. 'Megakaryocyte'
  26. 'Endothelial'
  27. 'Erythroid-like and erythroid precursor cells'
  28. 'HSC/MPP cells'
  29. 'Progenitor cells'
  30. 'Myeloid Dendritic cells'
  31. 'Plasmacytoid Dendritic cells'
  32. 'Granulocytes'
  33. 'ISG expressing immune cells'
  34. 'Cancer cells'
In [56]:
# get cell-type by cell matrix (scores each cell for each cell type)
es.max = sctype_score(scRNAseqData = abc.integrated@assays$integrated@scale.data, 
                      scaled = TRUE, 
                      gs = gs_list$gs_positive, gs2 = gs_list$gs_negative, 
                      gene_names_to_uppercase = TRUE) 

# merge by cluster by taking the sum of the scores of each cell type
cL_resutls = do.call("rbind", lapply(unique(md$seurat_clusters), function(cl){
    es.max.cl = sort(rowSums(es.max[ ,rownames(md[md$seurat_clusters==cl, ])]), decreasing = !0)
    head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(md$seurat_clusters==cl)), 10)
}))

write.csv(cL_resutls, 
         paste0(data_path, 'interim/celltype_cluster_scores.csv'))

# take max score of each cluster
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  
sctype_scores[['freq']]<-100*sctype_scores$ncells/sum(sctype_scores$ncells)

sctype_scores[['Confidence']]<-'good'
sctype_scores$Confidence[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "bad"


sctype_scores[with(sctype_scores, order(cluster)), ]
A grouped_df: 27 × 6
clustertypescoresncellsfreqConfidence
<fct><chr><dbl><int><dbl><chr>
0 Naive CD8+ T cells 8409.80958555617.5406472good
1 Pre-B cells 12057.23958349611.0370955good
2 Naive CD4+ T cells 3909.461622579 8.1420679good
3 Memory CD8+ T cells 5769.909042476 7.8168903good
4 Myeloid Dendritic cells 4163.551922356 7.4380426good
5 Basophils 1110.359012089 6.5951066good
6 Effector CD8+ T cells 4112.453301645 5.1933702good
7 Macrophages 3757.402041574 4.9692186good
8 Non-classical monocytes 1206.32986 988 3.1191792good
9 Myeloid Dendritic cells 2178.10713 967 3.0528808good
10CD8+ NKT-like cells 2463.14662 961 3.0339384good
11Plasmacytoid Dendritic cells 2303.04401 875 2.7624309good
12Eosinophils -75.54162 872 2.7529597bad
13Natural killer cells 91.08425 798 2.5193370bad
14Myeloid Dendritic cells 776.45560 727 2.2951855good
15CD4+ NKT-like cells 782.70961 549 1.7332281good
16Progenitor cells 1029.58665 478 1.5090766good
17Macrophages 2832.49822 477 1.5059195good
18Myeloid Dendritic cells 162.83592 472 1.4901342good
19Natural killer cells 2509.25371 334 1.0544594good
20Pre-B cells 920.31825 323 1.0197316good
21Effector CD4+ T cells 1353.22775 314 0.9913181good
22Plasmacytoid Dendritic cells 735.13280 310 0.9786898good
23Neutrophils 2067.02922 217 0.6850829good
24γδ-T cells 235.62306 134 0.4230466good
25Naive B cells 723.09840 69 0.2178374good
26Pre-B cells 117.54717 39 0.1231255good

A cluster annotation has "low" confidence if the ScType score is < 0 or < 0.25*# of cells in that cluster. ScType annotates these as unknown, s.t. seemingly not enough information is available in the transcriptomics data and markers database. We will want to double check these and perhaps keep them labelled as Unknown.

In [57]:
unique(sctype_scores$type)
  1. 'Naive CD8+ T cells'
  2. 'Pre-B cells'
  3. 'Naive CD4+ T cells'
  4. 'Myeloid Dendritic cells'
  5. 'CD8+ NKT-like cells'
  6. 'Natural killer cells'
  7. 'CD4+ NKT-like cells'
  8. 'Plasmacytoid Dendritic cells'
  9. 'Memory CD8+ T cells'
  10. 'Effector CD4+ T cells'
  11. 'Effector CD8+ T cells'
  12. 'Basophils'
  13. 'Macrophages'
  14. 'γδ-T cells'
  15. 'Eosinophils'
  16. 'Non-classical monocytes'
  17. 'Neutrophils'
  18. 'Progenitor cells'
  19. 'Naive B cells'
In [58]:
# assign cell types to barcodes 
mapper<-setNames(sctype_scores$type, sctype_scores$cluster)
md[['Cell.Type.ScType']]<-unlist(unname(mapper[as.character(md$seurat_clusters)]))


co<-c('Progenitor cells', 'Naive B cells', 'Pre-B cells', 'γδ-T cells', 
      'Naive CD4+ T cells', 'Effector CD4+ T cells', 
      'Naive CD8+ T cells', 'Effector CD8+ T cells', 'Memory CD8+ T cells',
      'CD4+ NKT-like cells', 'CD8+ NKT-like cells', 'Natural killer  cells',
     'Myeloid Dendritic cells', 'Plasmacytoid Dendritic cells', 
      'Non-classical monocytes', 'Macrophages',
     'Basophils', 'Eosinophils', 'Neutrophils')

md[['Cell.Type.ScType']]<-factor(x = md$Cell.Type.ScType,levels = co)
# retain individual barcode scores
md<-cbind(md, t(es.max[, rownames(md)]))
abc.integrated@meta.data<-md

Let's see what the distribution of scores was for the annotations:

In [59]:
viz.df<-md[!colnames(md) %in% colnames(md)[c(1,2,3,4,5, 7)]]
viz.df<-melt(viz.df, value.name = 'Annotation.Score')
colnames(viz.df)<-c('seurat_clusters', 'Cell.Type', 'Annotation.Score')

h_ = 50
w_ = 30
options(repr.plot.height=h_, repr.plot.width=w_)
g1<-ggplot(data = viz.df, aes(x = Cell.Type, y = Annotation.Score)) + geom_violin() + 
facet_wrap(~seurat_clusters, ncol = 2)+
theme(axis.text.x = element_text(angle = 45))

g1
Warning message in melt(viz.df, value.name = "Annotation.Score"):
“The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(viz.df). In the next version, this warning will become an error.”
Using seurat_clusters as id variables

In [75]:
# prepare edges
cL_resutls=cL_resutls[order(cL_resutls$cluster),]; edges = cL_resutls; edges$type = paste0(edges$type,"_",edges$cluster); edges$cluster = paste0("cluster ", edges$cluster); edges = edges[,c("cluster", "type")]; colnames(edges) = c("from", "to"); rownames(edges) <- NULL

# prepare nodes
nodes_lvl1 = sctype_scores[,c("cluster", "ncells")]; nodes_lvl1$cluster = paste0("cluster ", nodes_lvl1$cluster); nodes_lvl1$Colour = "#f1f1ef"; nodes_lvl1$ord = 1; nodes_lvl1$realname = nodes_lvl1$cluster; nodes_lvl1 = as.data.frame(nodes_lvl1); nodes_lvl2 = c(); 
ccolss= c("#5f75ae","#92bbb8","#64a841","#e5486e","#de8e06","#eccf5a","#b5aa0f","#e4b680","#7ba39d","#b15928","#ffff99", "#6a3d9a","#cab2d6","#ff7f00","#fdbf6f","#e31a1c","#fb9a99","#33a02c","#b2df8a","#1f78b4","#a6cee3")
for (i in 1:length(unique(cL_resutls$cluster))){
  dt_tmp = cL_resutls[cL_resutls$cluster == unique(cL_resutls$cluster)[i], ]; nodes_lvl2 = rbind(nodes_lvl2, data.frame(cluster = paste0(dt_tmp$type,"_",dt_tmp$cluster), ncells = dt_tmp$scores, Colour = ccolss[i], ord = 2, realname = dt_tmp$type))
}
nodes = rbind(nodes_lvl1, nodes_lvl2); nodes$ncells[nodes$ncells<1] = 1;
files_db = openxlsx::read.xlsx(db_)[,c("cellName","shortName")]; files_db = unique(files_db); nodes = merge(nodes, files_db, all.x = T, all.y = F, by.x = "realname", by.y = "cellName", sort = F)
nodes$shortName[is.na(nodes$shortName)] = nodes$realname[is.na(nodes$shortName)]; nodes = nodes[,c("cluster", "ncells", "Colour", "ord", "shortName", "realname")]

mygraph <- graph_from_data_frame(edges, vertices=nodes)

# Make the graph
gggr<- ggraph(mygraph, layout = 'circlepack', weight=I(ncells)) + 
  geom_node_circle(aes(filter=ord==1,fill=I("#F5F5F5"), colour=I("#D3D3D3")), alpha=0.9) + geom_node_circle(aes(filter=ord==2,fill=I(Colour), colour=I("#D3D3D3")), alpha=0.9) +
  theme_void() + geom_node_text(aes(filter=ord==2, label=shortName, colour=I("#ffffff"), fill="white", repel = !1, parse = T, size = I(log(ncells,25)*1.5)))+ geom_node_label(aes(filter=ord==1,  label=shortName, colour=I("#000000"), size = I(3), fill="white", parse = T), repel = !0, segment.linetype="dotted")


h_ = 10
w_ = 10
options(repr.plot.height=h_, repr.plot.width=w_)
gggr
# scater::multiplot(DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE, cols = ccolss), gggr, cols = 2)
Non-leaf weights ignored
Warning message:
“Ignoring unknown aesthetics: fill, repel, parse”
Warning message:
“Ignoring unknown aesthetics: parse”

Let's look at distributions of scores for the cell type that was actually assigned to each cluster:

In [60]:
# filter for the score that was assigned
retain.idx<-c()
for (cluster in names(mapper)){
    retain.idx<-c(retain.idx, 
                  rownames(viz.df[(viz.df$seurat_clusters == cluster) & (viz.df$Cell.Type == mapper[cluster]), ]))
}
viz.df<-viz.df[retain.idx,]

h_ = 5
w_ = 15
options(repr.plot.height=h_, repr.plot.width=w_)
g2<-ggplot(data = viz.df, aes(x = seurat_clusters, y = Annotation.Score)) + geom_violin() 

g2
In [16]:
h_ = 8
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
g3a <- DimPlot(abc.integrated, reduction = "umap", group.by = "seurat_clusters", shuffle = T, label = T)+
ggtitle('Cluster')
g3c <- DimPlot(abc.integrated, reduction = "umap", group.by = "Cell.Type.ScType", shuffle = T,  label = F)+
ggtitle('Cell Type')

g3<-g3a + g3c
# for (ext in c('.svg', '.png', '.pdf')){ggsave(paste0(data_path, 'figures/processing/', 
#                                                      'UMAP_cluster_celltype', ext), g3)}     
g3

Let's see whether the many-to-one mappings from cluster to cell type label are for clusters near each other in UMAP space or far apart. We will want to double check the ones that are far apart.

In [30]:
md<-abc.integrated@meta.data
md_og<-abc.integrated@meta.data


cluster.label<-'seurat_clusters'
unique.cell.cluster<-md[!duplicated(md[c(cluster.label, 'Cell.Type.ScType')]), ]
celltypes.multiple<-names(which(table(unique.cell.cluster$Cell.Type.ScType) > 1))
print('The following cells are labelled in more than one cluster: ')
celltypes.multiple
[1] "The following cells are labelled in more than one cluster: "
  1. 'Pre-B cells'
  2. 'Natural killer cells'
  3. 'Myeloid Dendritic cells'
  4. 'Plasmacytoid Dendritic cells'
  5. 'Macrophages'
In [31]:
# label according to both 
parse.cluster<-function(y, x){
    if (y %in% celltypes.multiple){
        return(paste0(y, ' - ', x))
    }else{
        return(as.character(y))
    }
}
md[['Cell.Type.uniquecluster']]<-mapply(parse.cluster, md$Cell.Type.ScType, md[[cluster.label]])


co<-c('Progenitor cells', 'Naive B cells', 'Pre-B cells - 1', 'Pre-B cells - 20', 'Pre-B cells - 26', 'γδ-T cells', 
      'Naive CD4+ T cells', 'Effector CD4+ T cells', 
      'Naive CD8+ T cells', 'Effector CD8+ T cells', 'Memory CD8+ T cells',
      'CD4+ NKT-like cells', 'CD8+ NKT-like cells', 'Natural killer  cells - 19', 'Natural killer  cells - 13', 
     'Myeloid Dendritic cells - 4', 'Myeloid Dendritic cells - 9', 'Myeloid Dendritic cells - 14', 
      'Myeloid Dendritic cells - 18', 
      'Plasmacytoid Dendritic cells - 11', 'Plasmacytoid Dendritic cells - 22',
      'Non-classical monocytes', 'Macrophages - 7','Macrophages - 17',
     'Basophils', 'Eosinophils', 'Neutrophils')
md[['Cell.Type.uniquecluster']]<-factor(x = md$Cell.Type.uniquecluster,levels = co)


mapper<-setNames(1:length((unique(md$Cell.Type.uniquecluster))), sort(unique(md$Cell.Type.uniquecluster)))
md[['Cell.Type.forUMAP']]<-unname(unlist(mapper[md[['Cell.Type.uniquecluster']]]))
md[['Cell.Type.forUMAP']]<-factor(x = md$Cell.Type.forUMAP,
                                        levels = sort(unique(md[['Cell.Type.forUMAP']])))
legend.labels<-paste0(unname(unlist(mapper)), ': ', names(mapper))

abc.integrated@meta.data<-md

Annotation: "#: Cell Type - Cluster Label"

  • "#" maps the label on the UMAP plot to the legend
  • "Cell Type" is the annotated cell type
  • "Cluster Label" is the "seurate_cluster_let" label, if the cell type mapped to more than one cluster
In [38]:
h_ = 7
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)

colors = c(brewer.pal(n=12, "Set3"),
           brewer.pal(n=8, "Dark2"), 
           brewer.pal(n=7, "Accent"))
theme = theme(panel.background = element_blank(), panel.border = element_rect(colour = 'black', fill = 'NA'),
         text = element_text(size=28), panel.spacing = unit(1.15, "lines"), 
          axis.title=element_text(size=28), legend.text=element_text(size=16), 
         legend.title=element_text(size=20), axis.text.x = element_text(size = 22), 
             axis.text.y = element_text(size = 22)) 

g4<-DimPlot(abc.integrated, reduction = 'umap',  
            group.by = 'Cell.Type.forUMAP', label = T)+
    xlab('UMAP 1')+ylab('UMAP 2')+
    scale_color_manual(values = colors, labels = legend.labels) + labs(color='Cell Type')+theme+
    ggtitle('Cell Types (Clusters distinguished)')
g4

Pre-b cells, myeloid DCs, and macrophages cluster together.

pDCs are near each other but not directly adjacent.

NK Cells do not cluster together. Note, one of the NK cell labels maps to cluster 13, which was a low confidence label from ScType.

In [96]:
h_ = 7
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
gQC1<-ggplot(data = abc.integrated@meta.data, 
             aes(y = percent.mt, x = seurat_clusters, fill = seurat_clusters))+geom_violin()+
            scale_y_continuous(breaks = round(seq(0, max(md[['percent.mt']]), by = 5),1))+
            theme_bw()
gQC1
In [100]:
h_ = 7
w_ = 16
options(repr.plot.height=h_, repr.plot.width=w_)
gQC2<-ggplot(data = abc.integrated@meta.data, 
             aes(y = nCount_RNA, x = seurat_clusters, fill = seurat_clusters))+geom_violin()+
            scale_y_continuous(breaks = round(seq(0, max(md[['nCount_RNA']]), by = 1e4),1))+
            theme_bw()
gQC2

Looking at the QC metrics by cluster, along witht the lack of annotation for clusters 12 and 13, we may consider dropping these ones.

Dotplot¶

In [42]:
abc.integrated@meta.data<-md_og

Dotplot of each marker to double check that annotated cell types make sense.

In [86]:
markers<-unname(unlist(gs_list$gs_positive))

print(paste0('The total number of unique markers is: ', length(markers)))

markers.present<-intersect(markers, toupper(rownames(abc.integrated)))
print(paste0('The total number of unique markers in the HVGs: ', length(markers.present)))
fwrite(as.list(markers.present), paste0(data_path, 'interim/scType_pos_markers.txt'))
[1] "The total number of unique markers is: 720"
[1] "The total number of unique markers in the HVGs: 114"
In [88]:
markers.present<-rownames(abc.integrated)[toupper(rownames(abc.integrated)) %in% markers] # keep mouse gene name
In [91]:
h_ = 12
w_ = 45
options(repr.plot.height=h_, repr.plot.width=w_)

theme = theme(panel.background = element_blank(), 
              panel.border = element_rect(colour = 'black', fill = 'NA'),
         text = element_text(size=28), panel.spacing = unit(1.15, "lines"), 
          axis.title=element_text(size=23), legend.text=element_text(size=16), 
         legend.title=element_text(size=22), axis.text.y = element_text(size = 18), 
              legend.key = element_blank(),
             axis.text.x = element_text(size = 16))
suppressWarnings({
    g<-DotPlot(abc.integrated, features = markers.present, cols = c('blue', 'red'), cluster.idents = T) + 
    RotatedAxis()+
    xlab('Marker Gene') + ylab('Cluster') + theme + ggtitle("scType markers")
})

# for (ext in c('.svg', '.png', '.pdf')){ggsave(paste0(data_path, 'figures/processing/', 
#                                                      'scType_marker_dotplot', ext), g, 
#                                              height = h_, width = w_)}

g

Cluster Markers¶

Identify markers for each cluster to double check that annotated cell types make sense and re-annotate low-confidence cell types/those that were annotated across disparate clusters.

According to this study, there is not high concordance in methods, but Wilcoxon rank-sum is the most commonly used (and Seurat's default) and there is no need to account for confounders since we are considering all cells across batch.

In [101]:
# # Wilcoxon
# # normalized counts as recommended by 
# # https://github.com/satijalab/seurat/issues/2636
# # https://github.com/satijalab/seurat/discussions/4000
# markers.wilcoxon<-FindAllMarkers(object = abc.integrated, assay = 'RNA', only.pos = T, 
#                                 slot = 'data', test.use = 'wilcox', 
#                                 min.pct = 0.25, # stringent since markers
#                                 logfc.threshold = 0.5 
#                                   )
# saveRDS(markers.wilcoxon, paste0(data_path, 'interim/cluster_markers_wilcoxon.RDS'))

markers<-readRDS(paste0(data_path, 'interim/cluster_markers_wilcoxon.RDS'))
In [126]:
markers<-readRDS(paste0(data_path, 'interim/cluster_markers_wilcoxon.RDS'))
In [129]:
# format markers
markers.excel<-function(marker, de.type){
    markers_workbook<-createWorkbook()
    for (cluster in sort(unique(marker$cluster))){
        de.res.cl<-as.data.frame(marker[marker$cluster == cluster, ])
        rownames(de.res.cl)<-1:dim(de.res.cl)[[1]]

    #     # make infinites characlers, otherwise writes as NULL
    #     idx_<-as.numeric(rownames(de.res.cl[is.infinite(de.res.cl$LFC), ]))

        addWorksheet(markers_workbook, cluster)
        writeData(markers_workbook, sheet = cluster, x = de.res.cl)
    }
    saveWorkbook(markers_workbook, overwrite = T, 
                 paste0(data_path, 'interim/', de.type, '_markers.xlsx'))
}

counter<-1
suppressMessages({
    suppressWarnings({
        de.type<-'wilcoxon'
        markers<-markers[markers$p_val_adj <= 0.1,] # threshold on p_adj
        markers<-markers[markers$avg_log2FC > 0.75,] # further threshold on LFC
        markers<-markers[with(markers, order(cluster, -avg_log2FC)), ] # sort by effect size

        markers[['scType.annotation']]<-unlist(unname(mapper[as.character(markers$cluster)]))

        markers.excel(markers, de.type) # save to excel file

    })
})
In [132]:
saveRDS(abc.integrated, 
        file = paste0(data_path, 'interim/abc_sctype_annotated.RDS'))